20회 복기
# Libraries: data wrangling (tidyverse/data.table), dates (lubridate),
# and modelling (caret/recipes/rsample/forecast).
library(tidyverse)
library(data.table)
library(lubridate)
library(caret)
library(recipes)
library(rsample)
library(forecast)

# Daily temperature data: 348 observations x 12 variables.
# (Original line fused the last library() call with this statement.)
temp <- fread("data/temps.csv")
temp %>% str()
## Classes 'data.table' and 'data.frame': 348 obs. of 12 variables:
## $ year : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int 1 2 3 4 5 6 7 8 9 10 ...
## $ week : chr "Fri" "Sat" "Sun" "Mon" ...
## $ temp_2 : int 45 44 45 44 41 40 44 51 45 48 ...
## $ temp_1 : int 45 45 44 41 40 44 51 45 48 50 ...
## $ average : num 45.6 45.7 45.8 45.9 46 46.1 46.2 46.3 46.4 46.5 ...
## $ actual : int 45 44 41 40 44 51 45 48 50 52 ...
## $ forecast_noaa : int 43 41 43 44 46 43 45 43 46 45 ...
## $ forecast_acc : int 50 50 46 48 46 49 49 47 50 48 ...
## $ forecast_under: int 44 44 47 46 46 48 46 46 45 48 ...
## $ friend : int 29 61 56 53 41 40 38 34 47 49 ...
## - attr(*, ".internal.selfref")=<externalptr>
결측값 처리
# Count missing values per column: the raw data is complete.
colSums(is.na(temp))
## year month day week temp_2
## 0 0 0 0 0
## temp_1 average actual forecast_noaa forecast_acc
## 0 0 0 0 0
## forecast_under friend
## 0 0
# 348 rows x 12 columns.
dim(temp)
## [1] 348 12
# Build a proper Date column from the year/month/day components.
temp <- temp %>%
mutate(date = make_date(year, month, day))
# Full 2016 calendar (leap year -> 366 days). The raw data has 348 rows,
# so 18 calendar days are missing.
date_t <- seq(ymd("2016-01-01"), ymd("2016-12-31"), by = "1 day")
date_t %>% length()
## [1] 366
temp %>% dim()
## [1] 348 13
# Left-join the data onto the full calendar so every 2016 date is present;
# the 18 days absent from the raw data become NA rows.
temp1 <- tibble(date = date_t) %>%
  left_join(temp, by = "date")
temp1 %>% is.na() %>% colSums()
## date year month day week
## 0 18 18 18 18
## temp_2 temp_1 average actual forecast_noaa
## 18 18 18 18 18
## forecast_acc forecast_under friend
## 18 18 18
lag 값 비교
temp1 %>% head()## # A tibble: 6 x 13
## date year month day week temp_2 temp_1 average actual forecast_noaa
## <date> <int> <int> <int> <chr> <int> <int> <dbl> <int> <int>
## 1 2016-01-01 2016 1 1 Fri 45 45 45.6 45 43
## 2 2016-01-02 2016 1 2 Sat 44 45 45.7 44 41
## 3 2016-01-03 2016 1 3 Sun 45 44 45.8 41 43
## 4 2016-01-04 2016 1 4 Mon 44 41 45.9 40 44
## 5 2016-01-05 2016 1 5 Tues 41 40 46 44 46
## 6 2016-01-06 2016 1 6 Wed 40 44 46.1 51 43
## # ... with 3 more variables: forecast_acc <int>, forecast_under <int>,
## # friend <int>
temp1 %>% tail()## # A tibble: 6 x 13
## date year month day week temp_2 temp_1 average actual forecast_noaa
## <date> <int> <int> <int> <chr> <int> <int> <dbl> <int> <int>
## 1 2016-12-26 2016 12 26 Mon 41 42 45.2 42 45
## 2 2016-12-27 2016 12 27 Tues 42 42 45.2 47 41
## 3 2016-12-28 2016 12 28 Wed 42 47 45.3 48 41
## 4 2016-12-29 2016 12 29 Thurs 47 48 45.3 48 43
## 5 2016-12-30 2016 12 30 Fri 48 48 45.4 57 44
## 6 2016-12-31 2016 12 31 Sat 48 57 45.5 40 42
## # ... with 3 more variables: forecast_acc <int>, forecast_under <int>,
## # friend <int>
# Sanity check: temp_1 should be the previous day's actual temperature.
# Compare it against lag(actual, 1) with every NA replaced by 45
# (45 is the value temp_1 holds on Jan 1 in the raw data).
lag1 <- temp1$temp_1
lag1_1 <- temp1$actual %>%
lag(n = 1) %>%
replace_na(45)
# The NA / FALSE entries below line up with the 18 missing calendar days.
lag1 == lag1_1
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE NA NA FALSE TRUE TRUE
## [49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE NA
## [61] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [73] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [97] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [109] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [145] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [157] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [193] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [205] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [217] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [229] TRUE NA NA NA NA NA NA FALSE NA NA NA NA
## [241] FALSE NA FALSE NA NA NA FALSE TRUE TRUE TRUE TRUE TRUE
## [253] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [265] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [277] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [289] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [301] TRUE TRUE TRUE NA FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [313] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [325] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [337] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [349] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [361] TRUE TRUE TRUE TRUE TRUE TRUE
# Same check for temp_2: it should be the actual temperature two days back.
# No NA fill here, so the first two entries are NA by construction.
lag2 <- temp1$temp_2
lag2_1 <- temp1$actual %>%
lag(n = 2)
lag2 == lag2_1
## [1] NA NA TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE NA NA NA NA TRUE
## [49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE NA
## [61] TRUE NA TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [73] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [97] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [109] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [145] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [157] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [193] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [205] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [217] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [229] TRUE NA NA NA NA NA NA NA NA NA NA NA
## [241] NA NA FALSE NA NA NA NA NA TRUE TRUE TRUE TRUE
## [253] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [265] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [277] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [289] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [301] TRUE TRUE TRUE NA FALSE NA TRUE TRUE TRUE TRUE TRUE TRUE
## [313] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [325] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [337] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [349] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [361] TRUE TRUE TRUE TRUE TRUE TRUE
recipes
# Recipe: keep only date + average + actual, mean-impute the 18 missing
# days, then rebuild calendar features (month/day/week-of-year) from date.
rec <- temp1 %>%
  recipe(actual ~ .) %>%
  step_rm(forecast_acc, forecast_noaa, forecast_under, friend,
          year, month, day, week, temp_1, temp_2) %>%
  # step_meanimpute() is deprecated; step_impute_mean() is the current name.
  step_impute_mean(average, actual) %>%
  step_mutate(month = month(date),
              day = day(date),
              week = week(date))  # week(): week of the year (1-53), not weekday
temp1 <- rec %>% prep() %>% juice()  # juice() is superseded by bake(new_data = NULL)
temp1 %>% head()## # A tibble: 6 x 6
## date average actual month day week
## <date> <dbl> <int> <dbl> <int> <dbl>
## 1 2016-01-01 45.6 45 1 1 1
## 2 2016-01-02 45.7 44 1 2 1
## 3 2016-01-03 45.8 41 1 3 1
## 4 2016-01-04 45.9 40 1 4 1
## 5 2016-01-05 46 44 1 5 1
## 6 2016-01-06 46.1 51 1 6 1
temp1 %>% is.na() %>% colSums()## date average actual month day week
## 0 0 0 0 0 0
# Build the lag predictors: temp1/temp2 are the actual temperatures 1 and
# 2 days before. Heads are filled with the raw data's Jan 1/2 values (45, 44).
lag1_1 <- temp1$actual %>%
  lag(n = 1) %>%
  replace_na(45)
lag2_1 <- temp1$actual %>%
  lag(n = 2)
lag2_1[1] <- 45
lag2_1[2] <- 44
lag_dat <- data.frame(temp1 = lag1_1, temp2 = lag2_1)
temp1 <- temp1 %>%
  bind_cols(lag_dat)  # (original line fused this call with the next heading)

# random split
# Random 70/30 split (immediately overwritten by the time-ordered split below).
splits <- initial_split(temp1, prop = 0.7)
train <- training(splits)
test <- testing(splits)

# time series split: first 70% of days -> train, last 30% -> test.
splits <- initial_time_split(temp1, prop = 0.7)
train <- training(splits)
test <- testing(splits)
train %>% tail()## # A tibble: 6 x 8
## date average actual month day week temp1 temp2
## <date> <dbl> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2016-09-07 73 67 9 7 36 68 68
## 2 2016-09-08 72.8 72 9 8 36 67 68
## 3 2016-09-09 72.6 74 9 9 37 72 67
## 4 2016-09-10 72.3 77 9 10 37 74 72
## 5 2016-09-11 72.1 70 9 11 37 77 74
## 6 2016-09-12 71.8 74 9 12 37 70 77
test %>% head()## # A tibble: 6 x 8
## date average actual month day week temp1 temp2
## <date> <dbl> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 2016-09-13 71.5 75 9 13 37 74 70
## 2 2016-09-14 71.2 79 9 14 37 75 74
## 3 2016-09-15 71 71 9 15 37 79 75
## 4 2016-09-16 70.7 75 9 16 38 71 79
## 5 2016-09-17 70.3 68 9 17 38 75 71
## 6 2016-09-18 70 69 9 18 38 68 75
caret time series fold
initialWindow: the initial number of consecutive values in each training set sample
horizon: The number of consecutive values in test set sample
fixedWindow: A logical; if FALSE, the training set always starts at the first sample and the training set size varies across data splits.
train %>% dim()
## [1] 256 8
test %>% dim()
## [1] 110 8

# Rolling-origin resampling: with fixedWindow = FALSE the training window
# always starts at day 1 and grows; each slice is evaluated 110 days ahead.
control <- trainControl(method = 'timeslice',
                        initialWindow = 110,
                        horizon = 110,
                        fixedWindow = FALSE)

# Random forest tuned over mtry; with 7 predictors only 6 unique values exist.
rf_gridsearch <- train(actual ~ .,
                       data = train,
                       method = 'rf',
                       trControl = control,
                       metric = 'RMSE',
                       tuneLength = 30)
## note: only 6 unique complexity parameters in default grid. Truncating the grid to 6 .
plot(varImp(rf_gridsearch, scale = FALSE))

pred <- predict(rf_gridsearch, newdata = test)
print(RMSE(pred, test$actual))
## [1] 7.090518
# Prediction baseline: RMSE of the long-run `average` column itself.
print(RMSE(test$average, test$actual))
## [1] 5.515452
# Re-split at random. NOTE(review): set.seed() comes AFTER this split, so
# the split itself is not reproducible -- only the CV folds below are.
splits <- initial_split(temp1, prop = 0.7)
train <- training(splits)
test <- testing(splits)
set.seed(123)
control <- trainControl(method='cv',
number=5)
# Random forest with plain 5-fold CV. NOTE(review): with a random split,
# lag features from days adjacent to test days appear in training, which
# likely explains the lower RMSE than the time-ordered split above.
rf_gridsearch <- train(actual ~ .,
data = train,
method = 'rf',
trControl = control,
metric = 'RMSE',
tuneLength = 30) ## note: only 6 unique complexity parameters in default grid. Truncating the grid to 6 .
pred <- predict(rf_gridsearch, newdata = test)
print(RMSE(pred, test$actual))
## [1] 4.255657
# Polynomial-kernel SVM with the same 5-fold CV; small tuning grid
# (2 values each of degree, scale, C).
set.seed(123)
control <- trainControl(method='cv',
number=5)
svm_gridsearch <- train(actual ~ .,
data = train,
method = 'svmPoly',
trControl = control,
metric = 'RMSE',
tuneLength = 2)
# NOTE(review): paren is misplaced below -- `scale = F` sits outside
# varImp(); likely meant varImp(svm_gridsearch, scale = FALSE).
#plot(varImp(svm_gridsearch), scale = F)
svm_gridsearch
## Support Vector Machines with Polynomial Kernel
##
## 256 samples
## 7 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 205, 205, 204, 205, 205
## Resampling results across tuning parameters:
##
## degree scale C RMSE Rsquared MAE
## 1 0.001 0.25 10.416218 0.7580829 8.362563
## 1 0.001 0.50 9.508185 0.7681199 7.554397
## 1 0.010 0.25 5.991957 0.8140133 4.524842
## 1 0.010 0.50 5.368753 0.8185179 4.026601
## 2 0.001 0.25 9.505851 0.7684511 7.552638
## 2 0.001 0.50 8.067988 0.7847464 6.280475
## 2 0.010 0.25 5.353952 0.8193898 4.019654
## 2 0.010 0.50 5.080852 0.8234947 3.828760
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were degree = 2, scale = 0.01 and C = 0.5.
# DALEX 2.0.1: model-agnostic explanation of the fitted SVM.
library(DALEX)

# train[, -3] drops column 3 (`actual`, the target) from the predictor data.
ex <- DALEX::explain(model = svm_gridsearch,
                     data = train[,-3],
                     y = train$actual)
## Preparation of a new explainer is initiated
## -> model label : train.formula ( [33m default [39m )
## -> data : 256 rows 7 cols
## -> data : tibble converted into a data.frame
## -> target variable : 256 values
## -> predict function : yhat.train will be used ( [33m default [39m )
## -> predicted values : No value for predict function target column. ( [33m default [39m )
## -> model_info : package caret , ver. 6.0.88 , task regression ( [33m default [39m )
## -> predicted values : numerical, min = 41.84912 , mean = 62.22759 , max = 80.98269
## -> residual function : difference between y and yhat ( [33m default [39m )
## -> residuals : numerical, min = -11.71798 , mean = 0.4677218 , max = 18.17584
## [32m A new explainer has been created! [39m

# Permutation-based variable importance.
vip <- model_parts(explainer = ex)
plot(vip)

pred <- predict(svm_gridsearch, newdata = test)
print(RMSE(pred, test$actual))
## [1] 4.84906
# Prediction baseline: RMSE of the long-run `average` column itself.
print(RMSE(test$average, test$actual))
## [1] 6.584604
SVM
Random forest
multiple regression
regression tree
Neural network
참고
R을 활용한 기계학습, 브레트 란츠
library(ResidentialEnergyConsumption)
# 15-minute residential power readings; keep the first 50 households of
# week 44. NOTE(review): w44 readings are relabelled with Jan 1-7 dates
# below -- confirm this relabelling is intentional.
dat <- ResidentialEnergyConsumption::elcons_15min
dat <- dat$w44[1:50, ]
# One week of 15-minute timestamps: 7 days x 96 readings = 672 per household.
date <- seq(ymd_hm('2015-01-01 00:00'), ymd_hm('2015-01-07 23:45'), by = '15 mins')
# Long format: one row per household x timestamp. The original date-string
# column is dropped and rebuilt from the sequence above (household-major
# order, so rep(date, 50) lines up with pivot_longer's output).
dat <- dat %>%
  pivot_longer(col = -VID, names_to = 'date', values_to = 'power') %>%
  select(-date)
dat <- data.frame(date = rep(date, 50)) %>%
  bind_cols(dat) %>%
  mutate(year = year(date),
         month = month(date),
         day = day(date)) %>%
  select(VID, date, year, month, day, power)

| 가구코드 | Date | 일사용량 total | Cluster |
# Standardise the power readings and k-means them into 5 clusters
# (nstart = 25 restarts for a stable solution).
pow <- scale(dat$power)
set.seed(123)
km.res <- kmeans(pow, 5, nstart = 25)
# Attach each reading's cluster assignment as a `cluster` column.
dat <- dat %>%
  mutate(cluster = km.res$cluster)
dat %>% head()
## VID date year month day power cluster
## 1 7855756 2015-01-01 00:00:00 2015 1 1 0.03 4
## 2 7855756 2015-01-01 00:15:00 2015 1 1 0.68 5
## 3 7855756 2015-01-01 00:30:00 2015 1 1 0.57 5
## 4 7855756 2015-01-01 00:45:00 2015 1 1 0.03 4
## 5 7855756 2015-01-01 01:00:00 2015 1 1 1.16 5
## 6 7855756 2015-01-01 01:15:00 2015 1 1 0.88 5
# Statistical mode of a vector: the most frequent value, with ties broken
# by order of first appearance. Works for any vector type.
# NOTE(review): this name shadows base::mode() (storage-mode query).
mode <- function(x) {
  distinct_vals <- unique(x)
  counts <- tabulate(match(x, distinct_vals))
  distinct_vals[which.max(counts)]
}
# Per household and day: total consumption and the modal cluster label.
result <- dat %>%
group_by(VID, year, month, day) %>%
summarise(total = sum(power, na.rm = T), clust = mode(cluster)) %>%
ungroup()
result %>% head()
## # A tibble: 6 x 6
## VID year month day total clust
## <int> <dbl> <dbl> <int> <dbl> <int>
## 1 1481540 2015 1 1 38.1 4
## 2 1481540 2015 1 2 36.4 4
## 3 1481540 2015 1 3 41.1 4
## 4 1481540 2015 1 4 27.1 4
## 5 1481540 2015 1 5 27.3 4
## 6 1481540 2015 1 6 30.8 4
| 1:00 | 2:00 | 3:00 | 4:00 | 5:00 … | |
| 월 | |||||
| 화 | |||||
| 수 | |||||
| 목 | |||||
| 금 | |||||
| 토 | |||||
| 일 |
lubridate::wday
# wday(label = TRUE) default: the week starts on Sunday, so the factor
# levels begin with 일 (Sun) in this locale.
dat %>%
filter(cluster == 4) %>%
mutate(wday = wday(date, label = T),
hour = hour(date)) %>%
select(wday) %>%
str()
## 'data.frame': 23248 obs. of 1 variable:
## $ wday: Ord.factor w/ 7 levels "일"<"월"<"화"<..: 5 5 5 5 5 5 5 5 5 5 ...
# week_start = 1 makes Monday (월) the first level instead.
dat %>%
filter(cluster == 4) %>%
mutate(wday = wday(date, label = T, week_start = 1),
hour = hour(date)) %>%
select(wday) %>%
str()
## 'data.frame': 23248 obs. of 1 variable:
## $ wday: Ord.factor w/ 7 levels "월"<"화"<"수"<..: 4 4 4 4 4 4 4 4 4 4 ...
scale_y_discrete
# Heat map of mean power by weekday x hour for cluster 4.
dat %>%
  filter(cluster == 4) %>%
  mutate(wday = wday(date, label = TRUE, week_start = 1),
         hour = hour(date),
         wday = as.factor(wday)) %>%
  group_by(wday, hour) %>%
  summarise(mean_w = mean(power)) %>%
  ungroup() %>%
  ggplot(aes(x = hour, y = wday, fill = mean_w)) +
  geom_tile()

# Same heat map, but scale_y_discrete(limits = ...) pins the weekday order
# (bottom to top: 일/Sun ... 월/Mon), so Monday is drawn at the top.
dat %>%
  filter(cluster == 4) %>%
  mutate(wday = wday(date, label = TRUE, week_start = 1),
         hour = hour(date),
         wday = as.factor(wday)) %>%
  group_by(wday, hour) %>%
  summarise(mean_w = mean(power)) %>%
  ungroup() %>%
  ggplot(aes(x = hour, y = wday, fill = mean_w)) +
  geom_tile() +
  scale_y_discrete(limits = c("일", "토", "금", "목", "수", "화", "월"))